#---------------------------------------------------------
# Are Intermediary Constraints Priced?
# Du, Hébert, Huber
# The Review of Financial Studies 2022
#---------------------------------------------------------

#---------------------------------------------------------
# Clean raw files in US gov't bond, CDS, commodities
# Extend to 2020 December
#---------------------------------------------------------

#---------------------------------------------------------
# Set-up
#---------------------------------------------------------

if (exists('script_path')) {
  rm(list = setdiff(ls(), 'script_path'))
} else {
  args = commandArgs(trailingOnly = T)
  if (length(args) > 0) {
    script_path <- args[1]  
    rm(list = setdiff(ls(), 'script_path'))
  } else {
    rm(list = ls())
    script_path <- '~/Dropbox/ForwardArbitrage/CodeReplication/'
  } 
}

source(paste0(script_path, 'RCode/ArbFunc.R'))

#---------------------------------------------------------
# US gov't bond / Fama-French (format changed since last data extension to 2019)
#---------------------------------------------------------

us_bond <- fread(paste0(script_path, 'DataSkeleton/AssetClasses/US_govt_bonds_07202021.csv'))
us_bond <- us_bond[TTERMTYPE %in% c(12012, 12024, 12036, 12048, 12060, 6120, 6121)] #<12m, 12-24, 24-36, 36-48, 48-60, 60-120, 120+
conversion <- data.table(TTERMTYPE = c(12012, 12024, 12036, 12048, 12060, 6120, 6121),
                         port_name = paste('US_gov', sprintf('%02d', 1:7), sep = '_'))
us_bond[, port_name := conversion$port_name[match(us_bond$TTERMTYPE, conversion$TTERMTYPE)]]
us_bond[, Date := ymd(MCALDT)]
bonds_wide <- dcast(us_bond, Date ~ port_name, value.var = 'TMEWRETD')

fwrite(bonds_wide, paste0(script_path, 'OutputInterim/Clean_US_bonds.csv'))

#---------------------------------------------------------
# CDS
#---------------------------------------------------------

cds <- fread(paste0(script_path, 'DataSkeleton/AssetClasses/cds_total_returns_06082021.csv'))
names(cds) <- c('daily_date_cds', 'IG', 'HY', 'EM', 'EU', 'Cross')
cds[, `:=` (daily_date_cds = dmy(daily_date_cds), IG = as.numeric(IG), HY = as.numeric(HY), EM = as.numeric(EM), EU = as.numeric(EU), Cross = as.numeric(Cross))]
cds <- cds[complete.cases(cds)]

cds2 <- fread(paste0(script_path, 'DataSkeleton/AssetClasses/cdx_total_returns_add_07192021.csv'))
names(cds2) <- c('daily_date_cds', 'Senior', 'Sub')
cds2[, `:=` (daily_date_cds = dmy(daily_date_cds), Senior = as.numeric(Senior), Sub = as.numeric(Sub))]

cds <- merge(cds, cds2, by = 'daily_date_cds', all.x = T)
cds[, yyyymm := as.integer(paste0(year(daily_date_cds), sprintf('%02d', month(daily_date_cds))))]
cds <- cds[cds[, .I[which.max(daily_date_cds)], .(yyyymm)]$V1]

orig_cds_names <- c('IG', 'HY', 'EM', 'EU', 'Cross', 'Senior', 'Sub')
ret_cds_names <- paste(orig_cds_names, 'return', sep = '_')
cds[, (ret_cds_names) := lapply(.SD, function(x) x / shift(x, n = 1, type = 'lag') - 1), .SDcols = orig_cds_names]
cds <- cds[complete.cases(cds)]

fwrite(cds, paste0(script_path, 'OutputInterim/Clean_cds.csv'))

#---------------------------------------------------------
# Commodity
#---------------------------------------------------------

# Objective is to include the 23 commodities included in He Kelly Manela JFE 2017 (HKM)
# Since HKM does not specify the commodities used, we include those that show the highest correlation with the HKM panel
comm1 <- data.table(read_excel(paste0(script_path, 'DataSkeleton/AssetClasses/bloomberg_commodities_06082021.xlsx'), sheet = 'Sheet2'))
comm1[, daily_date_commodity := ymd(Dates)]
orig_comm_names <- names(comm1)[grepl('tr_', names(comm1))]
alt_comm_names <- sapply(orig_comm_names, function(x) substr(x, 4, 99))
comm1 <- comm1[, c('daily_date_commodity', orig_comm_names), with = F]
names(comm1) <- c('daily_date_commodity', alt_comm_names)
comm1[, (alt_comm_names) := lapply(.SD, function(x) as.numeric(x)), .SDcols = alt_comm_names]
comm1 <- comm1[complete.cases(comm1)]
comm1[, yyyymm := as.integer(paste0(year(daily_date_commodity), sprintf('%02d', month(daily_date_commodity))))]
comm1[, ME := max(daily_date_commodity), .(yyyymm)]
comm1 <- comm1[daily_date_commodity == ME, ]
comm1[, (alt_comm_names) := lapply(.SD, function(x) x / shift(x, n = 1, type = 'lag') - 1), .SDcols = alt_comm_names]
comm1 <- comm1[, c('yyyymm', alt_comm_names), with = F]

## Price download has a bit more scattered entries, use the last entry in each commodity to calculate return
comm2 <- data.table(read_excel(paste0(script_path, 'DataSkeleton/AssetClasses/add_bloomberg_commodities_07192021.xlsx'), sheet = 'Sheet2'))
comm2[, daily_date_commodity := ymd(Dates)]
orig_comm_names <- names(comm2)[2:6]
alt_comm_names <- c("Canola",    "Lumber",    "Oats",      "Rice",      "Palladium")
comm2 <- comm2[, c('daily_date_commodity', orig_comm_names), with = F]
names(comm2) <- c('daily_date_commodity', alt_comm_names)
comm2[, (alt_comm_names) := lapply(.SD, function(x) as.numeric(x)), .SDcols = alt_comm_names]
list_comm <- list()
for (i in 1:length(alt_comm_names)) {
  temp <- comm2[!is.na(get(alt_comm_names[i])), c('daily_date_commodity', alt_comm_names[i]), with = F]
  temp[, yyyymm := as.integer(paste0(year(daily_date_commodity), sprintf('%02d', month(daily_date_commodity))))]
  temp[, ME := max(daily_date_commodity), .(yyyymm)]
  temp <- temp[daily_date_commodity == ME, ]
  temp[, (alt_comm_names[i]) := lapply(.SD, function(x) x / shift(x, n = 1, type = 'lag') - 1), .SDcols = alt_comm_names[i]]
  list_comm[[i]] <- temp[, c('yyyymm', alt_comm_names[i]), with = F]
}
comm2 <- Reduce(function(x, y) merge(x, y, by = 'yyyymm', all = T), list_comm)
comm_combined <- merge(comm2, comm1, by = 'yyyymm', all = T)

# Check return correlations with HKM / re-label HKM
hkm <- fread(paste0(script_path, 'DataSkeleton/AssetClasses/He_Kelly_Manela_Factors_And_Test_Assets_monthly.csv'))
hkm_names <- names(hkm)[grepl('Commod_', names(hkm))]
hkm <- hkm[, c('yyyymm', hkm_names), with = F]

start_commod <- 199102
end_commd <- 201212
hkm_sub <- hkm[yyyymm >= start_commod, ]
commod_sub <- comm_combined[yyyymm >= start_commod & yyyymm <= end_commd, ]
corr <- matrix(, nrow = ncol(hkm_sub) - 1, ncol = ncol(commod_sub) - 1)
colnames(corr) <- names(commod_sub)[-1]
for (i in 2:ncol(hkm_sub)) {
  for (j in 2:ncol(commod_sub)) {
    corr[i - 1, j - 1] <- cor(hkm_sub[, (i), with = F], commod_sub[, (j), with = F], use = 'pairwise.complete.obs')
  }
}

impute_names <- matrix(, nrow = ncol(hkm_sub) - 1, ncol = 3)
for (i in 1:nrow(corr)) {
  impute_names[i, 1] <- i
  impute_names[i, 2] <- max(corr[i, ])
  impute_names[i, 3] <- colnames(corr)[which(corr[i, ] == max(corr[i, ]))]
}

impute_names <- data.table(impute_names)
names(impute_names) <- c('Portfolio', 'Corr', 'Commod')
true_names <- impute_names$Commod
names(hkm_sub) <- c('yyyymm', true_names)

comm <- merge(comm2[yyyymm >= start_commod, c('yyyymm', names(comm2)[names(comm2) %in% true_names]), with = F],
              comm1[yyyymm >= start_commod, c('yyyymm', names(comm1)[names(comm1) %in% true_names]), with = F],
              by = 'yyyymm', all = T)

comm <- comm[yyyymm >= 199102 & yyyymm <= 202106]
comm[, lapply(.SD, function(x) yyyymm[which(is.na(x))])]

# Paste in HKM where the commodity is missing for Oats (199606) and Rice (199205)
comm[which(yyyymm == 199205), Rice := hkm_sub[which(yyyymm == 199205), Rice]]
comm[which(yyyymm == 199606), Oats := hkm_sub[which(yyyymm == 199606), Oats]]

fwrite(comm, paste0(script_path, 'OutputInterim/Clean_commodities.csv'))
